{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "cc4ebed2",
   "metadata": {},
   "source": [
    "Inter-annotator agreement for all genre labels (TikTok).\n",
    "Reported metrics: percent agreement per genre; Cohen's kappa per genre (though it falsely assumes independence); Krippendorff's alpha for the whole labeling process."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2acdaa84",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Open both annotators' workbooks; the ExcelFile handles are kept so the sheet names can be inspected.\n",
    "Tess_excel = pd.ExcelFile(\"tiktok_grouplabels_Tess.xlsx\")\n",
    "Tan_excel = pd.ExcelFile(\"tiktok_grouplabels_Tan.xlsx\")\n",
    "\n",
    "print(\"Sheets in Tess Dataset:\")\n",
    "print(Tess_excel.sheet_names)\n",
    "\n",
    "print(\"\\nSheets in Tan Dataset:\")\n",
    "print(Tan_excel.sheet_names)\n",
    "\n",
    "# The following cells read the DataFrames `Tess` and `Tan`, which no cell of this notebook created\n",
    "# (it could only run on leftover kernel state). Parse them here so Restart & Run All works.\n",
    "# NOTE(review): assumes the labels live on the first sheet of each workbook - confirm against\n",
    "# the sheet names printed above and adjust LABEL_SHEET if needed.\n",
    "LABEL_SHEET = 0\n",
    "Tess = Tess_excel.parse(LABEL_SHEET)\n",
    "Tan = Tan_excel.parse(LABEL_SHEET)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b73e3b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dead-link rows are the ones flagged with an 'x' in the column at position 16.\n",
    "tess_is_dead = Tess.iloc[:, 16].eq('x')\n",
    "tan_is_dead = Tan.iloc[:, 16].eq('x')\n",
    "tess_x_rows = Tess.loc[tess_is_dead]\n",
    "tan_x_rows = Tan.loc[tan_is_dead]\n",
    "\n",
    "print(\"Indices with 'x' in Tess:\")\n",
    "print(tess_x_rows.index.tolist())\n",
    "\n",
    "print(\"\\nIndices with 'x' in Tan:\")\n",
    "print(tan_x_rows.index.tolist())\n",
    "\n",
    "# True only when both annotators flagged exactly the same rows.\n",
    "indices_match = tess_x_rows.index.equals(tan_x_rows.index)\n",
    "print(f\"\\nDo the indices match between Tess and Tan? {indices_match}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be42d6b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# A video flagged as a dead link by either annotator cannot be compared, so the union of the\n",
    "# flagged rows is dropped from BOTH frames. Dropping each frame's own rows separately leaves the\n",
    "# frames with different indexes whenever the flags disagree, which breaks the row-wise\n",
    "# comparisons performed later in the notebook.\n",
    "dead_link_index = tess_x_rows.index.union(tan_x_rows.index)\n",
    "\n",
    "# errors=\"ignore\" tolerates a flagged label that is present in only one of the two frames.\n",
    "Tess_cleaned = Tess.drop(index=dead_link_index, errors=\"ignore\")\n",
    "Tan_cleaned = Tan.drop(index=dead_link_index, errors=\"ignore\")\n",
    "\n",
    "print(\"Tess shape after removal:\", Tess_cleaned.shape)\n",
    "print(\"Tan shape after removal:\", Tan_cleaned.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95d7802f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare the titles of the columns at positions 18 through 42 between the two annotators.\n",
    "label_slice = slice(18, 43)\n",
    "tess_columns = Tess_cleaned.columns[label_slice]\n",
    "tan_columns = Tan_cleaned.columns[label_slice]\n",
    "\n",
    "print(\"Column titles in Tess (18-42):\")\n",
    "print(tess_columns.tolist())\n",
    "\n",
    "print(\"\\nColumn titles in Tan (18-42):\")\n",
    "print(tan_columns.tolist())\n",
    "\n",
    "# This is only a check: nothing is renamed here.\n",
    "column_titles_match = tess_columns.equals(tan_columns)\n",
    "print(f\"\\nDo the column titles match between Tess and Tan? {column_titles_match}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e54c4ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Positions of the label columns that are binarised and compared below.\n",
    "column_range = range(18, 43)\n",
    "\n",
    "def _is_one(value):\n",
    "    \"\"\"Return True when `value`, read as text and stripped, is numerically equal to 1.\"\"\"\n",
    "    try:\n",
    "        return float(str(value).strip()) == 1.0\n",
    "    except ValueError:\n",
    "        # Blank cells, 'None', 'x', free text, ... are simply \"not a 1\".\n",
    "        return False\n",
    "\n",
    "def standardize_labels(df):\n",
    "    \"\"\"Return a copy of `df` whose columns at positions `column_range` hold only 0/1 integers.\n",
    "\n",
    "    A cell becomes 1 when it is any spelling of the number one ('1', '1.0', ' 1 ', 1, 1.0,\n",
    "    '1.00', ...) and 0 otherwise. The previous exact-string list ['1', '1.0', '1'] repeated\n",
    "    '1' and silently turned other spellings of one into 0.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with at least 43 columns.\n",
    "\n",
    "    Returns:\n",
    "        A new DataFrame; the input frame is left untouched.\n",
    "    \"\"\"\n",
    "    standardized = df.copy()\n",
    "    for col in standardized.columns[column_range]:\n",
    "        standardized[col] = standardized[col].apply(_is_one).astype(int)\n",
    "    return standardized\n",
    "\n",
    "# Binarise the label columns of both annotators' sheets.\n",
    "Tess_cleaned, Tan_cleaned = (standardize_labels(frame) for frame in (Tess_cleaned, Tan_cleaned))\n",
    "\n",
    "def print_counts(df, df_name):\n",
    "    \"\"\"Print the value counts of every column of `df` located at the positions in `column_range`.\n",
    "\n",
    "    `df_name` is only used to label the printed output.\n",
    "    \"\"\"\n",
    "    label_columns = df.columns[column_range]\n",
    "    for col in label_columns:\n",
    "        print(f\"Column '{col}' in {df_name} - Counts of 1s and 0s:\")\n",
    "        print(df[col].value_counts())\n",
    "        print()\n",
    "\n",
    "print(\"Counts in Tess_cleaned:\")\n",
    "print_counts(df=Tess_cleaned, df_name=\"Tess_cleaned\")\n",
    "\n",
    "print(\"Counts in Tan_cleaned:\")\n",
    "print_counts(df=Tan_cleaned, df_name=\"Tan_cleaned\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4202fc15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-label agreement: percent agreement and Cohen's kappa\n",
    "\n",
    "from sklearn.metrics import cohen_kappa_score\n",
    "\n",
    "def calculate_agreement(Tan_df, Tess_df):\n",
    "    \"\"\"Compare the two annotators' labels column by column.\n",
    "\n",
    "    Columns are paired by POSITION (the positions in `column_range`), like the rest of the\n",
    "    notebook does, instead of looking Tess's column up by Tan's column title; a title that\n",
    "    differs between the two sheets therefore no longer raises a KeyError.\n",
    "\n",
    "    Args:\n",
    "        Tan_df: Tan's frame, with 0/1 labels in the columns at `column_range`.\n",
    "        Tess_df: Tess's frame, same layout. Both frames must carry the same row index,\n",
    "            otherwise the element-wise Series comparison raises.\n",
    "\n",
    "    Returns:\n",
    "        A list with one dict per label column holding the keys 'Column' (Tan's title),\n",
    "        'Tan 1s', 'Tess 1s', 'Tan 0s', 'Tess 0s', 'Percent Agreement' (0-100) and\n",
    "        'Cohens Kappa'.\n",
    "    \"\"\"\n",
    "    results = []\n",
    "    for position in column_range:\n",
    "        col = Tan_df.columns[position]\n",
    "        tan_labels = Tan_df.iloc[:, position]\n",
    "        tess_labels = Tess_df.iloc[:, position]\n",
    "\n",
    "        # Labels are 0/1, so the sum is the number of 1s and the remainder the number of 0s.\n",
    "        tan_1s = tan_labels.sum()\n",
    "        tess_1s = tess_labels.sum()\n",
    "        tan_0s = len(tan_labels) - tan_1s\n",
    "        tess_0s = len(tess_labels) - tess_1s\n",
    "\n",
    "        # Share of rows on which both annotators gave the same label, as a percentage.\n",
    "        percent_agreement = (tan_labels == tess_labels).mean() * 100\n",
    "\n",
    "        kappa = cohen_kappa_score(tan_labels, tess_labels)\n",
    "\n",
    "        results.append({\n",
    "            'Column': col,\n",
    "            'Tan 1s': tan_1s,\n",
    "            'Tess 1s': tess_1s,\n",
    "            'Tan 0s': tan_0s,\n",
    "            'Tess 0s': tess_0s,\n",
    "            'Percent Agreement': percent_agreement,\n",
    "            'Cohens Kappa': kappa\n",
    "        })\n",
    "\n",
    "    return results\n",
    "\n",
    "agreement_results = calculate_agreement(Tan_cleaned, Tess_cleaned)\n",
    "\n",
    "# Print one report per label column, each closed by a rule of 40 dashes.\n",
    "for result in agreement_results:\n",
    "    report_lines = (\n",
    "        f\"Column: {result['Column']}\",\n",
    "        f\"Tan 1s: {result['Tan 1s']}, Tess 1s: {result['Tess 1s']}\",\n",
    "        f\"Tan 0s: {result['Tan 0s']}, Tess 0s: {result['Tess 0s']}\",\n",
    "        f\"Percent Agreement: {result['Percent Agreement']:.2f}%\",\n",
    "        f\"Cohens Kappa: {result['Cohens Kappa']:.4f}\",\n",
    "        \"-\" * 40,\n",
    "    )\n",
    "    for line in report_lines:\n",
    "        print(line)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51f9a9b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Krippendorff's alpha"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d57175d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column positions of the genre labels within the cleaned sheets.\n",
    "genre_indexes = [18, 23, 24, 26, 29, 30, 34, 36, 39, 40, 41, 42]\n",
    "\n",
    "# Short names G1, G2, ... - one per entry of genre_indexes, in the same order.\n",
    "genre_columns = [f'G{i+1}' for i, _ in enumerate(genre_indexes)]\n",
    "\n",
    "# Independent copies of the genre columns, so renaming them leaves the cleaned sheets untouched.\n",
    "Tess_genres = Tess_cleaned.iloc[:, genre_indexes].copy()\n",
    "Tan_genres = Tan_cleaned.iloc[:, genre_indexes].copy()\n",
    "\n",
    "for genre_frame in (Tess_genres, Tan_genres):\n",
    "    genre_frame.columns = genre_columns\n",
    "    print(genre_frame.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b657360b",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def get_selected_genres(row):\n",
    "    \"\"\"Return the labels of `row` whose value equals 1, in the order they appear in the row.\"\"\"\n",
    "    return [genre for genre, flag in row.items() if flag == 1]\n",
    "\n",
    "# One list of selected genre names per row, for each annotator.\n",
    "Tess_labels = Tess_genres.apply(get_selected_genres, axis='columns')\n",
    "Tan_labels = Tan_genres.apply(get_selected_genres, axis='columns')\n",
    "\n",
    "# Side-by-side table of both annotators' selections.\n",
    "labels_by_annotator = {'Tess_labels': Tess_labels, 'Tan_labels': Tan_labels}\n",
    "combined_df = pd.DataFrame(labels_by_annotator)\n",
    "\n",
    "print(combined_df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad6ff9cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import combinations\n",
    "\n",
    "# Short genre names G1..G12.\n",
    "genres = [f'G{i+1}' for i in range(12)]\n",
    "\n",
    "# Each genre selected on its own: 12 single-genre selections.\n",
    "single_combinations = [(genre,) for genre in genres]  # 12 combinations\n",
    "\n",
    "# Pairs are drawn from the first 11 genres only (the last genre never appears in a pair).\n",
    "# Each pair is sorted as strings, the same way encode_genre_selection sorts a selection.\n",
    "pair_combinations = [tuple(sorted(pair)) for pair in combinations(genres[:-1], 2)]  # 55 combinations\n",
    "\n",
    "# (A stray '=' line that used to sit here made the whole cell a SyntaxError.)\n",
    "all_combinations = single_combinations + pair_combinations\n",
    "\n",
    "# Integer identifier of every allowed selection, used as a nominal category later on.\n",
    "combination_mapping = {comb: idx for idx, comb in enumerate(all_combinations)}\n",
    "\n",
    "total_combinations = len(combination_mapping)\n",
    "print(f\"Total combinations: {total_combinations}\")\n",
    "for comb, idx in list(combination_mapping.items())[:10]:  # Print the first 10 for inspection\n",
    "    print(f\"Combination: {comb}, Identifier: {idx}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c461b0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def encode_genre_selection(selection, mapping):\n",
    "    \"\"\"Return the identifier stored in `mapping` for `selection`, or -1 when it has none.\n",
    "\n",
    "    The selection is sorted and converted to a tuple before the lookup, so the order in\n",
    "    which the genres are listed does not matter.\n",
    "    \"\"\"\n",
    "    key = tuple(sorted(selection))\n",
    "    if key in mapping:\n",
    "        return mapping[key]\n",
    "    return -1\n",
    "\n",
    "# Encode both annotators' selections with the shared mapping (-1 = selection not in the mapping).\n",
    "combined_df['Tess_encoded'] = [\n",
    "    encode_genre_selection(labels, combination_mapping) for labels in combined_df['Tess_labels']\n",
    "]\n",
    "combined_df['Tan_encoded'] = [\n",
    "    encode_genre_selection(labels, combination_mapping) for labels in combined_df['Tan_labels']\n",
    "]\n",
    "\n",
    "report_columns = ['Tess_labels', 'Tess_encoded', 'Tan_labels', 'Tan_encoded']\n",
    "print(combined_df[report_columns].head())\n",
    "\n",
    "# Rows where at least one of the two selections could not be mapped.\n",
    "is_unmapped = combined_df['Tess_encoded'].eq(-1) | combined_df['Tan_encoded'].eq(-1)\n",
    "unmapped = combined_df[is_unmapped]\n",
    "\n",
    "if unmapped.empty:\n",
    "    print(\"All combinations are successfully mapped.\")\n",
    "else:\n",
    "    print(\"Unmapped combinations found:\")\n",
    "    print(unmapped[report_columns])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc5318b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# encode_genre_selection returns -1 for any selection that is not a key of the mapping,\n",
    "# so this flags every row where at least one annotator has such a selection.\n",
    "has_unmapped = (combined_df['Tess_encoded'] == -1) | (combined_df['Tan_encoded'] == -1)\n",
    "empty_selection_count = int(has_unmapped.sum())\n",
    "\n",
    "print(f\"Number of rows with empty selections: {empty_selection_count}\")\n",
    "\n",
    "# Keep only the rows where both selections were mapped.\n",
    "cleaned_df = combined_df[~has_unmapped]\n",
    "\n",
    "rows_removed = len(combined_df) - len(cleaned_df)\n",
    "print(f\"Number of rows removed: {rows_removed}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6439dc81",
   "metadata": {},
   "outputs": [],
   "source": [
    "import krippendorff\n",
    "\n",
    "# One sequence of encoded selections per annotator (Tess first, then Tan), unmapped rows removed.\n",
    "reliability_data = [cleaned_df[column].to_numpy() for column in ('Tess_encoded', 'Tan_encoded')]\n",
    "\n",
    "# The identifiers are arbitrary category codes, hence the nominal level of measurement.\n",
    "alpha = krippendorff.alpha(\n",
    "    reliability_data=reliability_data,\n",
    "    level_of_measurement='nominal',\n",
    ")\n",
    "\n",
    "print(f\"Krippendorff's Alpha for the cleaned data: {alpha}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c61f80a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second method: here we do not treat the labeling of pranks as a previous process, and we do not remove pranks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "131ccea5",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Column positions of the genre labels, and the short names (G1, G2, ...) given to them.\n",
    "genre_indexes = [18, 23, 24, 26, 29, 30, 34, 36, 39, 40, 41, 42]\n",
    "genre_columns = [f'G{i+1}' for i in range(len(genre_indexes))]\n",
    "\n",
    "# Rebuild Tess's genre frame from the cleaned sheet.\n",
    "Tess_genres = Tess_cleaned.iloc[:, genre_indexes].copy()\n",
    "Tess_genres.columns = genre_columns\n",
    "\n",
    "# Rebuild Tan's genre frame the same way.\n",
    "Tan_genres = Tan_cleaned.iloc[:, genre_indexes].copy()\n",
    "Tan_genres.columns = genre_columns\n",
    "\n",
    "print(Tess_genres.head())\n",
    "print(Tan_genres.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e088b80e",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def get_selected_genres(row):\n",
    "    \"\"\"Collect, in row order, every label of `row` whose value equals 1.\"\"\"\n",
    "    selected = []\n",
    "    for genre in row.index:\n",
    "        if row[genre] == 1:\n",
    "            selected.append(genre)\n",
    "    return selected\n",
    "\n",
    "# Rebuild the side-by-side table of selected genres (one list per row and annotator).\n",
    "Tess_labels = Tess_genres.apply(get_selected_genres, axis='columns')\n",
    "Tan_labels = Tan_genres.apply(get_selected_genres, axis='columns')\n",
    "\n",
    "combined_df = pd.DataFrame(dict(Tess_labels=Tess_labels, Tan_labels=Tan_labels))\n",
    "\n",
    "print(combined_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0dfbf6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Imported here as well: the only other import of `combinations` sits in an earlier cell that\n",
    "# does not compile, so this cell must not depend on it.\n",
    "from itertools import combinations\n",
    "\n",
    "# Short genre names G1..G12.\n",
    "genres = [f'G{i+1}' for i in range(12)]\n",
    "\n",
    "# Each genre selected on its own.\n",
    "single_combinations = [(genre,) for genre in genres]\n",
    "\n",
    "# Pairs are drawn from the first 11 genres only (the last genre never appears in a pair).\n",
    "pair_combinations = [tuple(sorted(pair)) for pair in combinations(genres[:-1], 2)]\n",
    "\n",
    "all_combinations = single_combinations + pair_combinations\n",
    "\n",
    "# Integer identifier of every allowed selection.\n",
    "combination_mapping = {comb: idx for idx, comb in enumerate(all_combinations)}\n",
    "\n",
    "total_combinations = len(combination_mapping)\n",
    "print(f\"Total combinations: {total_combinations}\")\n",
    "for comb, idx in list(combination_mapping.items())[:10]:\n",
    "    print(f\"Combination: {comb}, Identifier: {idx}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0398f455",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def encode_genre_selection(selection, mapping):\n",
    "    \"\"\"Map `selection` to its identifier in `mapping`; -1 when the selection is not a key.\n",
    "\n",
    "    The lookup key is the sorted selection as a tuple, so the order of the genres is irrelevant.\n",
    "    \"\"\"\n",
    "    lookup_key = tuple(sorted(selection))\n",
    "    return mapping[lookup_key] if lookup_key in mapping else -1\n",
    "\n",
    "\n",
    "# Encode each annotator's selections; a selection missing from the mapping becomes -1.\n",
    "def _encode(labels):\n",
    "    return encode_genre_selection(labels, combination_mapping)\n",
    "\n",
    "combined_df['Tess_encoded'] = combined_df['Tess_labels'].apply(_encode)\n",
    "combined_df['Tan_encoded'] = combined_df['Tan_labels'].apply(_encode)\n",
    "\n",
    "shown_columns = ['Tess_labels', 'Tess_encoded', 'Tan_labels', 'Tan_encoded']\n",
    "print(combined_df[shown_columns].head())\n",
    "\n",
    "# Rows in which at least one annotator's selection is encoded as -1.\n",
    "unmapped = combined_df[combined_df[['Tess_encoded', 'Tan_encoded']].eq(-1).any(axis=1)]\n",
    "\n",
    "if unmapped.empty:\n",
    "    print(\"All combinations are successfully mapped.\")\n",
    "else:\n",
    "    print(\"Unmapped combinations found:\")\n",
    "    print(unmapped[shown_columns])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f43bdeae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows where at least one annotator's selection is encoded as -1.\n",
    "unmapped_mask = combined_df[['Tess_encoded', 'Tan_encoded']].eq(-1).any(axis=1)\n",
    "empty_selection_count = int(unmapped_mask.sum())\n",
    "\n",
    "print(f\"Number of rows with empty selections: {empty_selection_count}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa5f1784",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Every row is kept this time: the -1 code is passed through as one more nominal category.\n",
    "tess_codes = combined_df['Tess_encoded'].to_numpy()\n",
    "tan_codes = combined_df['Tan_encoded'].to_numpy()\n",
    "reliability_data = [tess_codes, tan_codes]\n",
    "\n",
    "alpha = krippendorff.alpha(\n",
    "    reliability_data=reliability_data,\n",
    "    level_of_measurement='nominal',\n",
    ")\n",
    "\n",
    "print(f\"Krippendorff's Alpha including -1 as its own category: {alpha}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37e04220",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
